import plotly.express as px
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
from scipy import stats
# Load the processed Vancouver OSM records and plot each one on a map,
# coloured by the value of its `neighborhood` column.
van_osm_data = pd.read_json('../DataProcessing/processed/van_osm_data.json')
fig_neigh = px.scatter_mapbox(
    van_osm_data,
    lat="lat",
    lon="lon",
    color="neighborhood",
    color_continuous_scale=px.colors.cyclical.IceFire,
    size_max=15,
    zoom=10,
)
# OpenStreetMap base layer; zero margins on every side of the figure.
fig_neigh.update_layout(
    mapbox_style="open-street-map",
    margin=dict(l=0, r=0, b=0, t=0),
)
fig_neigh.show()
We can see that neighborhoods near Central Vancouver have a high density of amenities.
# Same map as for the full OSM data, but drawn from the refined file.
refined_van_osm_data = pd.read_json(
    '../DataProcessing/processed/van_osm_data_refined.json'
)
fig_neigh = px.scatter_mapbox(
    refined_van_osm_data,
    lat="lat",
    lon="lon",
    color="neighborhood",
    color_continuous_scale=px.colors.cyclical.IceFire,
    size_max=15,
    zoom=10,
)
# OpenStreetMap base layer; zero margins on every side of the figure.
fig_neigh.update_layout(
    mapbox_style="open-street-map",
    margin=dict(l=0, r=0, b=0, t=0),
)
fig_neigh.show()
# Read the per-neighborhood summary table and chart its `total_amenity`
# column as one bar per neighborhood.
neighbor_data = pd.read_csv('../DataProcessing/processed/neighborhood_info.csv')

plt.figure(figsize=(20, 15))
# Tilt and right-align the x tick labels.
plt.xticks(rotation=60, ha='right')
sns.barplot(data=neighbor_data, x="neighborhood", y="total_amenity")
plt.title('Number of amenities in each neighborhood', fontsize=20)
# Map the refined OSM records again, this time coloured by the value of
# the `category` column instead of the neighborhood.
fig_neigh = px.scatter_mapbox(
    refined_van_osm_data,
    lat="lat",
    lon="lon",
    color="category",
    color_continuous_scale=px.colors.cyclical.IceFire,
    size_max=15,
    zoom=10,
)
# OpenStreetMap base layer; zero margins on every side of the figure.
fig_neigh.update_layout(
    mapbox_style="open-street-map",
    margin=dict(l=0, r=0, b=0, t=0),
)
fig_neigh.show()
We can see that sustenance (food) is more widely scattered across the city than the other three categories. Transportation is heavily concentrated in areas near Central Vancouver. The other two categories, arts and leisure, appear only in limited numbers.
# Load the processed Airbnb records and map them, coloured by the value
# of the `neighborhood` column.
airbnb = pd.read_json('../DataProcessing/processed/airbnb_info.json')
fig_neigh = px.scatter_mapbox(
    airbnb,
    lat="lat",
    lon="lon",
    color="neighborhood",
    color_continuous_scale=px.colors.cyclical.IceFire,
    size_max=15,
    zoom=10,
)
# OpenStreetMap base layer; zero margins on every side of the figure.
fig_neigh.update_layout(
    mapbox_style="open-street-map",
    margin=dict(l=0, r=0, b=0, t=0),
)
fig_neigh.show()
First, we can see that the neighborhood classification model works quite well for the Airbnb dataset. The Airbnbs are separated into 18 clusters, each made up of closely located listings.
# One bar per neighborhood, with bar height equal to the number of rows
# in `airbnb` that carry that neighborhood value.
plt.figure(figsize=(20, 15))
# Tilt and right-align the x tick labels.
plt.xticks(rotation=60, ha='right')
sns.countplot(data=airbnb, x='neighborhood', palette='magma')
plt.title('Number of Airbnbs in each neighborhood', fontsize=20)
Interestingly, it seems that neighborhoods with more amenities tend to have more Airbnbs.
We can test this hypothesis with a linear regression slope test.
Central Vancouver may be an outlier because it has far more amenities than the other neighborhoods. Therefore, we exclude Central Vancouver from our test dataset.
# Count the rows of `airbnb` per neighborhood; the count lands in a
# column named `total_airbnb`.
airbnb_count = (
    airbnb.groupby('neighborhood')
    .size()
    .reset_index(name='total_airbnb')
)

# Attach the counts to the neighborhood summary table, matching rows on
# the `neighborhood` column.
neighbor_data = neighbor_data.join(
    airbnb_count.set_index('neighborhood'),
    on='neighborhood',
)

# Regression sample: every neighborhood except Central Vancouver.
test_data = neighbor_data.loc[neighbor_data['neighborhood'].ne('Central Vancouver')]

plt.figure(figsize=(5, 4))
sns.scatterplot(data=test_data, x="total_amenity", y="total_airbnb")
plt.title('Number of Amenity vs Number of Airbnb', fontsize=12)
plt.show()

# Simple linear regression of Airbnb count on amenity count; report the
# p-value of the slope.
reg = stats.linregress(x=test_data['total_amenity'], y=test_data['total_airbnb'])
print("p-value of linear regression slope test = ", reg.pvalue)
With all the data we have obtained, we can now explore the relationship between Airbnb score and other factors. Most of our data are not perfectly normally distributed, so our analysis relies on the central limit theorem: we work with the sample means of each neighborhood.
# Bar chart of the `score` column for every neighborhood in the summary
# table.
plt.figure(figsize=(10, 8))
sns.barplot(data=neighbor_data, x="neighborhood", y="score")
# Tilt and right-align the x tick labels.
plt.xticks(rotation=60, ha='right')
plt.title('Average score in neighborhood', fontsize=12)
plt.show()

# Scatter of `avg_price` against `score` on the regression sample.
plt.figure(figsize=(5, 4))
sns.scatterplot(data=test_data, x="avg_price", y="score")
plt.title('Average Price vs Average Score', fontsize=12)
plt.show()

# Simple linear regression of score on average price; report the p-value
# of the slope.
reg = stats.linregress(x=test_data['avg_price'], y=test_data['score'])
print("p-value of linear regression slope test = ", reg.pvalue)
# Listing-level view (no central limit theorem): one point per individual
# Airbnb. Hardly anything can be read off this plot.
plt.figure(figsize=(5, 4))
sns.scatterplot(data=airbnb, x="sustenance", y="avg_score")
plt.title('Number of sustenance vs Average Score', fontsize=12)
plt.show()

# Neighborhood-level view (with central limit theorem): one point per
# neighborhood in the regression sample.
plt.figure(figsize=(5, 4))
sns.scatterplot(data=test_data, x="sustenance", y="score")
plt.title('Number of sustenance vs Average Score', fontsize=12)
plt.show()

# Simple linear regression of score on sustenance count; report the
# p-value of the slope.
reg = stats.linregress(x=test_data['sustenance'], y=test_data['score'])
print("p-value of linear regression slope test = ", reg.pvalue)
# Neighborhood-level relationship between the `transportation` column and
# the `score` column of the regression sample (`test_data`).
plt.figure(figsize=(5, 4))
sns.scatterplot(x="transportation", y="score", data=test_data)
# The title names the variable that is actually on the x axis.
plt.title('Number of transportation vs Average Score', fontsize=12)
plt.show()

# Regress score on the same variable that is plotted (`transportation`),
# so the printed p-value belongs to this scatter rather than repeating
# the sustenance test.
reg = stats.linregress(test_data['transportation'], test_data['score'])
print("p-value of linear regression slope test = ", reg.pvalue)